#!/usr/bin/env python
#coding:utf-8
#解析HTML文档，并提取其中的http链接

import HTMLParser
from sgmllib import SGMLParser
import urllib

# Prints the links found in an HTML document while it is being parsed.
class ParseLinks(HTMLParser.HTMLParser):
    """HTML parser that prints link targets and start tags as it parses.

    For every start tag it prints the tag's raw source text; for ``<a>``
    tags it first prints the value of each ``href`` attribute.
    """

    def handle_starttag(self, tag, attrs):
        """Print the ``href`` values of an ``<a>`` tag, then the raw start tag.

        HTMLParser invokes this hook for each start tag. ``tag`` is the tag
        name and ``attrs`` is a list of ``(name, value)`` pairs.
        """
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # Single-argument parenthesised print gives the same
                    # output whether print is a statement or a function.
                    print(value)

        # Printed for every start tag, not only for <a>.
        print(self.get_starttag_text())

    # The hook was previously misspelled ``handle_startag``, so HTMLParser
    # never called it and nothing was printed. The old name is kept as an
    # alias for any code that called it directly.
    handle_startag = handle_starttag

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.url = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k=='href']
        if href:
            self.url.extend(href)

# URLLister is used here because the loop below reads its ``url`` list;
# ParseLinks (the alternative parser in this file) does not build one.
IParser = URLLister()
# Close the HTTP response once its body has been read instead of leaking
# the underlying socket.
page = urllib.urlopen('http://www.python.org/')
try:
    IParser.feed(page.read())
finally:
    page.close()
# close() makes the parser process any input it is still buffering.
IParser.close()

for url in IParser.url:
    print(url)
